Analysis 2

Final HTML file

Author

Sarah Rosenberg Asmussen (s194689), Mette Bøge Pedersen (s194679), Caroline Amalie Bastholm Jensen (s213427), Jaime Noguera Piera (s233773), Yassine Turki (s231735)

Project Title

In this project, we investigate a gene expression dataset for patients with bladder cancer. Our data was acquired from the National Center for Biotechnology Information https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE31684. It consists of microarray data from 93 bladder cancer patients. Our goal in this project is to investigate the correlation between gene expression and metastasis occurrence. We are also interested in assessing the relevance of smoking when looking at metastasis, and we looked at the regulation of the genes of this population when it comes to metastasis.

Creating data folder if not already exists

If the data folder and subfolder _raw does not already exist, this code will create it.

data_dir <- "../data/"
raw_dir <- "../data/_raw/"

if( !dir.exists(data_dir)){
  dir.create(path = data_dir)
}

if( !dir.exists(raw_dir)){
  dir.create(path = raw_dir)
}

Download metadata

metadata_file <- "GSE31684_meta_data.tsv"
metadata_loc <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE31nnn/GSE31684/suppl/GSE31684%5Ftable%5Fof%5Fclinical%5Fdetails.txt.gz"

if( !file.exists(str_c(raw_dir, metadata_file)) ){
  download.file(
    url = metadata_loc,
    destfile = str_c(raw_dir, metadata_file))
}
meta_data <- read_tsv(gzfile("../data/_raw/GSE31684_meta_data.tsv"),  show_col_types = FALSE)

meta_data
# A tibble: 93 × 24
   GEO       ID     Gender PreOpClinStage `Age at RC` `Survival Months` RC_Stage
   <chr>     <chr>  <chr>  <chr>                <dbl>             <dbl> <chr>   
 1 GSM786491 1_pT2  male   T3                    64.1            104.   pT2     
 2 GSM786492 2_pT2  male   T2                    62.1             13.2  pT2     
 3 GSM786493 3_pT2  female T2                    66.0             19.8  pT2     
 4 GSM786494 4_pT4  male   T2                    56.1             16.4  pT4     
 5 GSM786495 5_pT4  female T3                    68               13.1  pT4     
 6 GSM786496 6_pT3  male   T2                    69.9              4.44 pT3     
 7 GSM786497 7_pT1  male   T2                    65               87.9  pT1     
 8 GSM786498 8_pT3  male   T1                    69.4            173.   pT3     
 9 GSM786499 9_pT2  male   T2                    47.5            108.   pT2     
10 GSM786500 10_pT3 female T2                    77.9            176.   pT3     
# ℹ 83 more rows
# ℹ 17 more variables: RC_Histology <chr>, `PLND Result` <chr>,
#   `RC Grade` <chr>, `Nomogram Score` <dbl>, `Distant Mets` <dbl>,
#   `Local Recurrence` <dbl>, `Urothelial Recurrence` <dbl>, Metastasis <dbl>,
#   `Last known status` <chr>, PreRC_Chemo <chr>, `Post RC_Chemo` <chr>,
#   `PostChemo type` <chr>, Smoking <chr>, `SmokingPack-Years` <chr>,
#   `Recurrence Free Survival Months (Distant and Local)` <dbl>, …

Download gene expression data

# creating url
gene_url <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE31nnn/GSE31684/matrix/GSE31684_series_matrix.txt.gz"

# this is necessary to not get error message about connection buffer
Sys.setenv(VROOM_CONNECTION_SIZE=500000)

# Retrieve the data directly
# skippin first 82 lines because it is just info
gene_exp_file <- read_tsv(file = gene_url,
                          skip = 82)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 54676 Columns: 94
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (1): ID_REF
dbl (93): GSM786491, GSM786492, GSM786493, GSM786494, GSM786495, GSM786496, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Write the data to disk
write_tsv(x = gene_exp_file,
          file = "../data/_raw/gene_exp.tsv.gz")

gene_exp_data <- read_tsv(gzfile("../data/_raw/gene_exp.tsv.gz"))
Rows: 54676 Columns: 94
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (1): ID_REF
dbl (93): GSM786491, GSM786492, GSM786493, GSM786494, GSM786495, GSM786496, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gene_exp_data
# A tibble: 54,676 × 94
   ID_REF  GSM786491 GSM786492 GSM786493 GSM786494 GSM786495 GSM786496 GSM786497
   <chr>       <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
 1 1007_s…      6.80     11.3      11.0       8.95     11.7      11.7       9.88
 2 1053_at      8.39      8.16      8.01      8.15      8.16      7.99      7.96
 3 117_at       3.89      4.65      3.73      4.19      3.53      3.73      3.80
 4 121_at       5.43      5.43      5.42      5.49      5.43      5.43      5.43
 5 1255_g…      2.23      2.23      2.23      2.23      2.23      2.23      2.23
 6 1294_at      2.75      5.74      3.05      2.85      3.05      5.42      3.21
 7 1316_at      2.69      2.72      2.69      2.69      2.69      2.69      2.99
 8 1320_at      2.23      2.23      2.23      2.23      2.23      2.23      2.23
 9 1405_i…      6.71     13.0       6.96      6.90      4.83      4.83     12.9 
10 1431_at      2.23      2.23      2.23      2.23      2.23      2.23      2.23
# ℹ 54,666 more rows
# ℹ 86 more variables: GSM786498 <dbl>, GSM786499 <dbl>, GSM786500 <dbl>,
#   GSM786501 <dbl>, GSM786502 <dbl>, GSM786503 <dbl>, GSM786504 <dbl>,
#   GSM786505 <dbl>, GSM786506 <dbl>, GSM786507 <dbl>, GSM786508 <dbl>,
#   GSM786509 <dbl>, GSM786510 <dbl>, GSM786511 <dbl>, GSM786512 <dbl>,
#   GSM786513 <dbl>, GSM786514 <dbl>, GSM786515 <dbl>, GSM786516 <dbl>,
#   GSM786517 <dbl>, GSM786518 <dbl>, GSM786519 <dbl>, GSM786520 <dbl>, …
# Removed last line:
gene_exp_data <- gene_exp_data |> 
  filter(!str_detect(ID_REF, "^!"))
# Transpose the data table:
df_long <- gene_exp_data |>
  pivot_longer(cols = -ID_REF, names_to = "GEO", values_to = "Value")

# Transpose the dataframe from long to wide format
df_wide <- df_long |>
  pivot_wider(names_from = ID_REF, values_from = Value)
# Join metadata and gene expression data based on patient ID:
data_joined = full_join(meta_data, df_wide, by = join_by(GEO))
write_tsv(data_joined, "../data/01_dat_load.tsv.gz")

Load Data

Sys.setenv(VROOM_CONNECTION_SIZE=5000000)
data <- read_tsv(gzfile("../data/01_dat_load.tsv.gz"))
head(data)
# A tibble: 6 × 54,699
  GEO       ID    Gender PreOpClinStage `Age at RC` `Survival Months` RC_Stage
  <chr>     <chr> <chr>  <chr>                <dbl>             <dbl> <chr>   
1 GSM786491 1_pT2 male   T3                    64.1            104.   pT2     
2 GSM786492 2_pT2 male   T2                    62.1             13.2  pT2     
3 GSM786493 3_pT2 female T2                    66.0             19.8  pT2     
4 GSM786494 4_pT4 male   T2                    56.1             16.4  pT4     
5 GSM786495 5_pT4 female T3                    68               13.1  pT4     
6 GSM786496 6_pT3 male   T2                    69.9              4.44 pT3     
# ℹ 54,692 more variables: RC_Histology <chr>, `PLND Result` <chr>,
#   `RC Grade` <chr>, `Nomogram Score` <dbl>, `Distant Mets` <dbl>,
#   `Local Recurrence` <dbl>, `Urothelial Recurrence` <dbl>, Metastasis <dbl>,
#   `Last known status` <chr>, PreRC_Chemo <chr>, `Post RC_Chemo` <chr>,
#   `PostChemo type` <chr>, Smoking <chr>, `SmokingPack-Years` <chr>,
#   `Recurrence Free Survival Months (Distant and Local)` <dbl>,
#   `Recurrence/DOD` <chr>, Cluster <dbl>, `1007_s_at` <dbl>, …

Clean Metadata

We are creating an overview table of the metadata in the describe doc, and for that we need to clean up the data a bit. We select two attributes relevant to our analysis: Metastasis and Smoking, as well as gene expressions

# Cleaning the data: 
data_clean <- data |> 
  relocate(Metastasis, Smoking) |>   
  select(!ID:Cluster)

data_clean
# A tibble: 93 × 54,678
   Metastasis Smoking GEO    `1007_s_at` `1053_at` `117_at` `121_at` `1255_g_at`
        <dbl> <chr>   <chr>        <dbl>     <dbl>    <dbl>    <dbl>       <dbl>
 1          0 Former  GSM78…        6.80      8.39     3.89     5.43        2.23
 2          1 Never   GSM78…       11.3       8.16     4.65     5.43        2.23
 3          0 Current GSM78…       11.0       8.01     3.73     5.42        2.23
 4          1 Former  GSM78…        8.95      8.15     4.19     5.49        2.23
 5          0 Former  GSM78…       11.7       8.16     3.53     5.43        2.23
 6          1 Former  GSM78…       11.7       7.99     3.73     5.43        2.23
 7          0 Current GSM78…        9.88      7.96     3.80     5.43        2.23
 8          0 Never   GSM78…        9.61      8.78     4.11     5.61        2.23
 9          1 Former  GSM78…       11.2       9.00     3.93     5.43        2.23
10          0 Former  GSM78…        9.69      9.54     8.88     5.43        2.23
# ℹ 83 more rows
# ℹ 54,670 more variables: `1294_at` <dbl>, `1316_at` <dbl>, `1320_at` <dbl>,
#   `1405_i_at` <dbl>, `1431_at` <dbl>, `1438_at` <dbl>, `1487_at` <dbl>,
#   `1494_f_at` <dbl>, `1552256_a_at` <dbl>, `1552257_a_at` <dbl>,
#   `1552258_at` <dbl>, `1552261_at` <dbl>, `1552263_at` <dbl>,
#   `1552264_a_at` <dbl>, `1552266_at` <dbl>, `1552269_at` <dbl>,
#   `1552271_at` <dbl>, `1552272_a_at` <dbl>, `1552274_at` <dbl>, …

Writing the data to be used in 04_describe

write_tsv(data_clean, "../data/02_dat_clean_wide.tsv.gz")

Long data format

We would like the data in a long format, where each gene and gene expression has it’s own row, for the following augmentation and analysis.

# Making the dataset long:
data_clean_long <- data_clean |>

  select(!GEO) |> 
  pivot_longer(cols=-c(Metastasis, Smoking),
               names_to = "gene",
               values_to = "gene_expression")   

data_clean_long
# A tibble: 5,084,775 × 4
   Metastasis Smoking gene      gene_expression
        <dbl> <chr>   <chr>               <dbl>
 1          0 Former  1007_s_at            6.80
 2          0 Former  1053_at              8.39
 3          0 Former  117_at               3.89
 4          0 Former  121_at               5.43
 5          0 Former  1255_g_at            2.23
 6          0 Former  1294_at              2.75
 7          0 Former  1316_at              2.69
 8          0 Former  1320_at              2.23
 9          0 Former  1405_i_at            6.71
10          0 Former  1431_at              2.23
# ℹ 5,084,765 more rows

We would like to drop the genes with no variability, as they will not provide useful information to our analysis.

# Selecting genes with different average gene expression based on the metastatic and non-metastatic cases:
data_clean_long <- data_clean_long |>
  group_by(gene) |>
  filter(sd(gene_expression)!=0)

  
data_clean_long
# A tibble: 4,550,676 × 4
# Groups:   gene [48,932]
   Metastasis Smoking gene      gene_expression
        <dbl> <chr>   <chr>               <dbl>
 1          0 Former  1007_s_at            6.80
 2          0 Former  1053_at              8.39
 3          0 Former  117_at               3.89
 4          0 Former  121_at               5.43
 5          0 Former  1255_g_at            2.23
 6          0 Former  1294_at              2.75
 7          0 Former  1316_at              2.69
 8          0 Former  1320_at              2.23
 9          0 Former  1405_i_at            6.71
10          0 Former  1431_at              2.23
# ℹ 4,550,666 more rows

Writing the data

write_tsv(data_clean_long, "../data/02_dat_clean.tsv.gz")

Data Load

data_clean <- read_tsv(gzfile("../data/02_dat_clean.tsv.gz"))
Rows: 4550676 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): Smoking, gene
dbl (2): Metastasis, gene_expression

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_clean
# A tibble: 4,550,676 × 4
   Metastasis Smoking gene      gene_expression
        <dbl> <chr>   <chr>               <dbl>
 1          0 Former  1007_s_at            6.80
 2          0 Former  1053_at              8.39
 3          0 Former  117_at               3.89
 4          0 Former  121_at               5.43
 5          0 Former  1255_g_at            2.23
 6          0 Former  1294_at              2.75
 7          0 Former  1316_at              2.69
 8          0 Former  1320_at              2.23
 9          0 Former  1405_i_at            6.71
10          0 Former  1431_at              2.23
# ℹ 4,550,666 more rows

Creating new columns

# We calculate the t-test p value of the gene_expression by comparring metastasis and non-metastasis for each gene:
data_aug <- data_clean |>
  group_by(gene) |>
  mutate(p_value = t.test(gene_expression[Metastasis == 1], 
                          gene_expression[Metastasis == 0])$p.value) |>
  mutate(is_significant = case_when(p_value <= 0.01 ~ "Yes",
                                    p_value > 0.01 ~ "No" ))
# We are adding the Log2-Fold-Change as an average value for each gene, and a Log2-Fold-Change as a value for each sample with metastasis for each gene.
 data_aug_log <- data_aug |>
   mutate(log2_fold_change_avg = log2(mean(gene_expression[Metastasis == 1]) / 
                                 mean(gene_expression[Metastasis == 0])),
          log2_fold_change_sample = ifelse(
            Metastasis == 0,
            NA,              
            log2(gene_expression / mean(gene_expression[Metastasis == 0]))))

 data_aug_log
# A tibble: 4,550,676 × 8
# Groups:   gene [48,932]
   Metastasis Smoking gene      gene_expression p_value is_significant
        <dbl> <chr>   <chr>               <dbl>   <dbl> <chr>         
 1          0 Former  1007_s_at            6.80   0.562 No            
 2          0 Former  1053_at              8.39   0.339 No            
 3          0 Former  117_at               3.89   0.905 No            
 4          0 Former  121_at               5.43   0.809 No            
 5          0 Former  1255_g_at            2.23   0.578 No            
 6          0 Former  1294_at              2.75   0.561 No            
 7          0 Former  1316_at              2.69   0.163 No            
 8          0 Former  1320_at              2.23   0.364 No            
 9          0 Former  1405_i_at            6.71   0.903 No            
10          0 Former  1431_at              2.23   0.215 No            
# ℹ 4,550,666 more rows
# ℹ 2 more variables: log2_fold_change_avg <dbl>, log2_fold_change_sample <dbl>

Write the data

write_tsv(data_aug_log, "../data/03_dat_aug.tsv.gz")

Data description

Data is available here: NCBI

The article published based on the data is available here: PMC

The title of the article is:

Combination of a novel gene expression signature with a clinical nomogram improves the prediction of survival in high-risk bladder cancer

A nomogram is a method to graphically depict a statistical prognostic model that generates a probability of a clinical event, such as cancer recurrence or death.

Urothelial carcinoma of the urinary bladder is the fifth most common cancer in the Western World, and represents 3% of cancers diagnosed globally.

About the data we use

Cancer gene expression data from 93 patients undergoing radical cystectomy (RC) between 1993 and 2004. Lymph node dissection was performed in 77 patients; no patient has metastatic disease at the time of RC. Metastatic disease means that the cancer has spread to a different part of the body than where it started. Case selection was restricted to those with frozen specimens with measurable volume of malignancy and adequate percentage of tumor.

Data load

data_describe <- read_tsv(
  gzfile("../data/02_dat_clean_wide.tsv.gz"),
  show_col_types = FALSE
)

data_describe
# A tibble: 93 × 54,678
   Metastasis Smoking GEO    `1007_s_at` `1053_at` `117_at` `121_at` `1255_g_at`
        <dbl> <chr>   <chr>        <dbl>     <dbl>    <dbl>    <dbl>       <dbl>
 1          0 Former  GSM78…        6.80      8.39     3.89     5.43        2.23
 2          1 Never   GSM78…       11.3       8.16     4.65     5.43        2.23
 3          0 Current GSM78…       11.0       8.01     3.73     5.42        2.23
 4          1 Former  GSM78…        8.95      8.15     4.19     5.49        2.23
 5          0 Former  GSM78…       11.7       8.16     3.53     5.43        2.23
 6          1 Former  GSM78…       11.7       7.99     3.73     5.43        2.23
 7          0 Current GSM78…        9.88      7.96     3.80     5.43        2.23
 8          0 Never   GSM78…        9.61      8.78     4.11     5.61        2.23
 9          1 Former  GSM78…       11.2       9.00     3.93     5.43        2.23
10          0 Former  GSM78…        9.69      9.54     8.88     5.43        2.23
# ℹ 83 more rows
# ℹ 54,670 more variables: `1294_at` <dbl>, `1316_at` <dbl>, `1320_at` <dbl>,
#   `1405_i_at` <dbl>, `1431_at` <dbl>, `1438_at` <dbl>, `1487_at` <dbl>,
#   `1494_f_at` <dbl>, `1552256_a_at` <dbl>, `1552257_a_at` <dbl>,
#   `1552258_at` <dbl>, `1552261_at` <dbl>, `1552263_at` <dbl>,
#   `1552264_a_at` <dbl>, `1552266_at` <dbl>, `1552269_at` <dbl>,
#   `1552271_at` <dbl>, `1552272_a_at` <dbl>, `1552274_at` <dbl>, …
data_clean_sd <- read_tsv(gzfile("../data/02_dat_clean.tsv.gz"),
                            show_col_types = FALSE)

data_clean_sd
# A tibble: 4,550,676 × 4
   Metastasis Smoking gene      gene_expression
        <dbl> <chr>   <chr>               <dbl>
 1          0 Former  1007_s_at            6.80
 2          0 Former  1053_at              8.39
 3          0 Former  117_at               3.89
 4          0 Former  121_at               5.43
 5          0 Former  1255_g_at            2.23
 6          0 Former  1294_at              2.75
 7          0 Former  1316_at              2.69
 8          0 Former  1320_at              2.23
 9          0 Former  1405_i_at            6.71
10          0 Former  1431_at              2.23
# ℹ 4,550,666 more rows

Counting number of genes

number_of_genes <- data_describe |> 
  select(c(GEO, ends_with("_at"))) |> 
  pivot_longer(cols = -GEO,
               names_to = "Genes",
               values_to = "Gene_expression") |> 
  distinct(Genes) |> 
  count()

number_of_genes
# A tibble: 1 × 1
      n
  <int>
1 54675

We have a total number of 54,675 genes for each patient in the original dataset. Then we also calculate the standard deviation for each gene, and filtered out all the genes with a standard deviation of 0 which means they were neither up-regulated nor down-regulated.

number_of_genes_after_sd <- data_clean_sd |> 
  distinct(gene) |> 
  count()

number_of_genes_after_sd
# A tibble: 1 × 1
      n
  <int>
1 48932

After removing these genes, we end up with 48,932 genes.

Table visualizing meta data

table_data <- data_describe |> 
  mutate(Metastasis = factor(Metastasis)) |> 
  table1(x = formula(~ Smoking + Metastasis), 
         data = _,
         caption = "Overview of number of patients who have metastasis and smoking status")

table_data
Overview of number of patients who have metastasis and smoking status
Overall
(N=93)
Smoking
Current 19 (20.4%)
Former 56 (60.2%)
Never 18 (19.4%)
Metastasis
0 57 (61.3%)
1 36 (38.7%)

In the table we can see that the majority of patients are have smoked for at period of time in their lives, though we do not know for how long and how frequently they smoked in that time. Then we have almost an equal amount for current smokers, and patients who have never smoked.

For metastasis, we see that ~60% do not have metastasis and the rest do.

Writing a dataset to be used in presentation

data_describe |>
  select(c(Smoking, Metastasis)) |> 
  write_tsv("../data/04_dat_desc.tsv.gz")

Data Load

data <- read_tsv(gzfile("../data/03_dat_aug.tsv.gz"))
Rows: 4550676 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): Smoking, gene, is_significant
dbl (5): Metastasis, gene_expression, p_value, log2_fold_change_avg, log2_fo...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
# A tibble: 4,550,676 × 8
   Metastasis Smoking gene      gene_expression p_value is_significant
        <dbl> <chr>   <chr>               <dbl>   <dbl> <chr>         
 1          0 Former  1007_s_at            6.80   0.562 No            
 2          0 Former  1053_at              8.39   0.339 No            
 3          0 Former  117_at               3.89   0.905 No            
 4          0 Former  121_at               5.43   0.809 No            
 5          0 Former  1255_g_at            2.23   0.578 No            
 6          0 Former  1294_at              2.75   0.561 No            
 7          0 Former  1316_at              2.69   0.163 No            
 8          0 Former  1320_at              2.23   0.364 No            
 9          0 Former  1405_i_at            6.71   0.903 No            
10          0 Former  1431_at              2.23   0.215 No            
# ℹ 4,550,666 more rows
# ℹ 2 more variables: log2_fold_change_avg <dbl>, log2_fold_change_sample <dbl>

Analysis 1

In the first analysis, we want to identify which genes are found to be significantly different expressed in patients with metastatic cancer compared to non-metastatic cancer. Furthermore, we want to investigate if the gene expression is up-regulated of down-regulated.

The significance was calculated on the basis of a Student’s T-test where the expression of each gene was compared based on if the patients had metastasis or not.

The Log2 Fold Change for each gene was calculated based on the average gene expression level by comparing samples with metastasis and no metastasis.

Conclusion of first analysis:

These results are shown in a volcano plot where the -10log(p-value) is shown on the y-axis and the Log2-Fold-Change is hown on the x-axis. Each dot represents a gene. From this, we can observe some genes, specifically 286 genes out of 48,932 genes, that are significantly different expressed in patients with metastasis on a significance level of 0.01. We can also observe that the significant genes typically are more up-regulated or down-regulated.

volcano_plot <- data |> 
  select(gene, log2_fold_change_avg, p_value) |>
  unique() |>
  mutate(log_10_p = -log10(p_value),
         Significance = case_when(p_value > 0.01 ~ "Not significant",
                                    p_value <= 0.01 ~ "Significant")) |> 
  ggplot(mapping = aes(x = log2_fold_change_avg,
                       y = log_10_p,
                       color = Significance)) +
  geom_point(size = 1, alpha = 0.5) +
  geom_hline(yintercept=2,
             linetype="dotted", 
             color = "black", 
             size=0.5) +
  theme(legend.position = "none") + 
  theme_minimal() +
  labs(title="Genes Associated with Metastasis in Bladder Cancer", 
       subtitle = "Genes highlighted in turquoise are significant on a significance level of 0.01",
     x = "Log2 Fold Change",
     y = "-log10(p)") 
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
ggsave(
  filename = "../results/05_volcano_plot.png",
  plot = volcano_plot,
  device = "png",
  height = 5,
  dpi = 300,
  bg = "white"
)
Saving 7 x 5 in image
print(volcano_plot)

dev.off()
null device 
          1 

Data Load

data <- read_tsv(gzfile("../data/03_dat_aug.tsv.gz"))
Rows: 4550676 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): Smoking, gene, is_significant
dbl (5): Metastasis, gene_expression, p_value, log2_fold_change_avg, log2_fo...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
# A tibble: 4,550,676 × 8
   Metastasis Smoking gene      gene_expression p_value is_significant
        <dbl> <chr>   <chr>               <dbl>   <dbl> <chr>         
 1          0 Former  1007_s_at            6.80   0.562 No            
 2          0 Former  1053_at              8.39   0.339 No            
 3          0 Former  117_at               3.89   0.905 No            
 4          0 Former  121_at               5.43   0.809 No            
 5          0 Former  1255_g_at            2.23   0.578 No            
 6          0 Former  1294_at              2.75   0.561 No            
 7          0 Former  1316_at              2.69   0.163 No            
 8          0 Former  1320_at              2.23   0.364 No            
 9          0 Former  1405_i_at            6.71   0.903 No            
10          0 Former  1431_at              2.23   0.215 No            
# ℹ 4,550,666 more rows
# ℹ 2 more variables: log2_fold_change_avg <dbl>, log2_fold_change_sample <dbl>

Analysis 2

In this analysis, we want to explore the potential relationship between smoking habits and the altered regulation of gene expression. Each gene in our data set represents variation in gene expression across different patients. Interestingly, many genes exhibit both up-regulation and down-regulation patterns, which vary among individual patients.

We select the top 13 most significant genes from our previous analysis. To further investigate the regulation of gene expression for each patient with metastasis, we calculate a Log2 Fold Change value for each patient. This was calculated by comparing the gene expression in samples with metastasis to the average gene expression across samples without metastasis for each gene. The formula for a specific gene in observation X is:

\[ \text{Log2 Fold Change}(X) = \log_2\left(\frac{\text{gene expression}_{\text{metastasis}}(X)}{\text{mean}(\text{gene expression}_{\text{no-metastasis}})}\right) \]

This figure shows the top 13 most significant genes, and their average Log2 Fold Change value (circle), together with their lower and upper 95% confidence intervals (bars).

Conclusion of the second analysis:

From this figure, we can see that 5 genes are slightly up-regulated, and the rest 8 genes are down-regulated. For the gene 206561_s_at, being the most down-regulated, we also observe larger variability across these samples which is reflected in a larger error-bar.

conf_int_plot <- data |>
  group_by(gene) |>
  filter(Metastasis == 1) |>
  filter(p_value < 0.001) |>
  ggplot(aes(x = log2_fold_change_sample,
             y = reorder(gene, 
                         log2_fold_change_sample, 
                         FUN = mean))) + 
  stat_summary(fun.data = mean_cl_normal, 
               geom = "point", 
               shape = 20, 
               size = 2, 
               color = "black") +
  stat_summary(fun.data = mean_cl_normal, 
               geom = "errorbar", 
               width = 0.7, 
               color = "black", 
               size = 1) + 
  geom_vline(xintercept = 0, 
             linetype="dotted", 
             color = "blue", 
             size=1) + 
  xlab("Log2 Fold Change (95% CIs)") + 
  ylab("Genes") + 
  ggtitle("Regulation of Genes Associated with Metastasis in Bladder Cancer")

ggsave(
  filename = "../results/06_conf_int_plot.png",
  plot = conf_int_plot,
  device = "png",
  height = 4,
  dpi = 300  
)
Saving 7 x 4 in image
print(conf_int_plot)

dev.off()
null device 
          1 

We now want to investigate the proportion of Current Smokers, Former Smokers, and Non-smokers for each gene. Additionally, we want to compare these proportions based on how the gene expression is regulated (whether a gene is up-regulated or down-regulated).

smoking_plot <- data |>
  filter(Metastasis == 1) |>
  filter(p_value < 0.001) |>
  mutate(Regulation = case_when(log2_fold_change_sample > 0 ~ "Samples with upregulation",
                                 log2_fold_change_sample < 0 ~ "Samples with downregulation"),
         Avg_Regulation = case_when(log2_fold_change_avg < 0 ~ "Risk if downregulated",
                                    log2_fold_change_avg > 0 ~ "Risk if upregulated")) |>
    ggplot(aes(y = reorder(gene,
                         log2_fold_change_sample, 
                         FUN = mean),
               fill = Smoking)) +
      geom_bar() +
      facet_grid(Avg_Regulation ~ Regulation, scales="free", space = "free") +
      labs(x = "Percentage", y = "Genes") +
      ggtitle("Relationship Between Smoking Habits and the Altered Gene Expression") 

ggsave(
  filename = "../results/06_smoking_plot.png",
  plot = smoking_plot,
  device = "png",
  height = 4,
  dpi = 300  
)
Saving 7 x 4 in image
print(smoking_plot)

dev.off()
null device 
          1